bts = readRDS('bts.rds')
bcg = readRDS('bcg.rds')
bsa = readRDS('bsa.rds')
bsg = readRDS('bsg.rds')
bsr = readRDS('bsr.rds')
bst = readRDS('bst.rds')
btm = readRDS('btm.rds')
timss = readRDS('timss_2015.rds')
با توجه به اینکه مقدار pvalue بسیار کم شده است در نتیجه تحصیلات پدر و مادر بر روی نمرههای بچهها تاثیر گذار است که این موضوع به وضوح در نمودارهای رسم شده قابل مشاهده است.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(highcharter)
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
av_grade = bsg %>% mutate(grade = (bsmmat01 + bsmmat02 + bsmmat03 + bsmmat04 + bsmmat05)/5)
gra_edu = merge(av_grade %>% select(idstud, idcntry, idschool, grade), bsg %>% select(mother = bsbg07a, father = bsbg07b, idstud, idcntry, idschool), by=c("idstud", "idcntry", "idschool"))
gra_edu = gra_edu[!(is.na(gra_edu$mother)),]
gra_edu = gra_edu[!(is.na(gra_edu$father)),]
good_father = gra_edu %>% filter(father > 4 & father < 9)
bad_father = gra_edu %>% filter(father > 0 & father < 5)
t.test(good_father$grade, bad_father$grade)
##
## Welch Two Sample t-test
##
## data: good_father$grade and bad_father$grade
## t = 88.314, df = 222970, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 35.54291 37.15635
## sample estimates:
## mean of x mean of y
## 493.9241 457.5745
ggplot() + geom_density(data = good_father, aes(x = grade, fill = "Educated"), alpha = 0.5) + geom_density(data = bad_father, aes(x = grade, fill = "Uneducated"), alpha = 0.5) + labs(title = "Father grade density", x = "Grade", y = "Density", fill = "Education Level")
highchart() %>% hc_add_series(density(good_father$grade), type = "area", name = "Educated Father") %>% hc_add_series(density(bad_father$grade), type = "area", name = "Uneducated Father") %>% hc_title(text = "Father grade density") %>% hc_yAxis(title = list(text = "Density")) %>% hc_xAxis(title = list(text = "Grade"))
good_mother = gra_edu %>% filter(mother > 4 & mother < 9)
bad_mother = gra_edu %>% filter(mother > 0 & mother < 5)
t.test(good_mother$grade, bad_mother$grade)
##
## Welch Two Sample t-test
##
## data: good_mother$grade and bad_mother$grade
## t = 104.51, df = 245500, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 41.41611 42.99924
## sample estimates:
## mean of x mean of y
## 497.8699 455.6622
ggplot() + geom_density(data = good_mother, aes(x = grade, fill = "Educated"), alpha = 0.5) + geom_density(data = bad_mother, aes(x = grade, fill = "Uneducated"), alpha = 0.5) + labs(title = "Mother grade density", x = "Grade", y = "Density", fill = "Education Level")
highchart() %>% hc_add_series(density(good_mother$grade), type = "area", name = "Educated Mother") %>% hc_add_series(density(bad_mother$grade), type = "area", name = "Uneducated Mother") %>% hc_title(text = "Mother grade density") %>% hc_yAxis(title = list(text = "Density")) %>% hc_xAxis(title = list(text = "Grade"))
با توجه به اینکه مقدار pvalue به دست آمده بسیار کم است در نتیجه میزان امکانات بر نمره بچه ها تاثیر گذار است که این موضوه با توجه به نمودارهای رسم شده به وضوح قابل دیدن است.
facilities = av_grade %>% mutate(total_facility = 22 - bsbg06a - bsbg06b - bsbg06c - bsbg06d - bsbg06e - bsbg06f - bsbg06g - bsbg06h - bsbg06i - bsbg06j - bsbg06k) %>% select(total_facility, grade)
facilities = facilities[!(is.na(facilities$total_facility)),]
good_facilities = facilities %>% filter(total_facility > 6 & total_facility < 12)
bad_facilities = facilities %>% filter(total_facility > 0 & total_facility < 7)
t.test(good_facilities$grade, bad_facilities$grade)
##
## Welch Two Sample t-test
##
## data: good_facilities$grade and bad_facilities$grade
## t = 91.921, df = 124460, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 47.11368 49.16661
## sample estimates:
## mean of x mean of y
## 493.1770 445.0368
ggplot() + geom_density(data = good_facilities, aes(x = grade, fill = "High"), alpha = 0.5) + geom_density(data = bad_facilities, aes(x = grade, fill = "Low"), alpha = 0.5) + labs(title = "Facility grade density", x = "Grade", y = "Density", fill = "Facility Level")
highchart() %>% hc_add_series(density(good_facilities$grade), type = "area", name = "High facility") %>% hc_add_series(density(bad_facilities$grade), type = "area", name = "Low facility") %>% hc_title(text = "Facility grade density") %>% hc_yAxis(title = list(text = "Density")) %>% hc_xAxis(title = list(text = "Grade"))
با توجه به اینکه مقدار pvalue بسیار کوچک است در نتیجه میزان امنیت بر روی نمرات تاثیر گذار است که میتوان این موضوع را به راحتی در نمودارها مشاهده کرد.
safety_in_school = av_grade %>% select(safety = bsbg15b, grade)
safety_in_school = safety_in_school[!(is.na(safety_in_school$safety)),]
safety_group = safety_in_school %>% group_by(safety) %>% summarise(grade = mean(grade))
safety1 = safety_in_school %>% filter(safety == 1)
safety2 = safety_in_school %>% filter(safety == 2)
safety3 = safety_in_school %>% filter(safety == 3)
safety4 = safety_in_school %>% filter(safety == 4)
safety_in_school$safety = factor(safety_in_school$safety, levels = 1:4)
aov_test = aov(grade ~ safety, safety_in_school)
summary(aov_test)
## Df Sum Sq Mean Sq F value Pr(>F)
## safety 3 3.928e+07 13093132 1197 <2e-16 ***
## Residuals 274074 2.999e+09 10943
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot() + geom_density(data = safety1, aes(x = grade, fill = "Very safe"), alpha = 0.5) + geom_density(data = safety2, aes(x = grade, fill = "Safe"), alpha = 0.5) + geom_density(data = safety3, aes(x = grade, fill = "Not safe"), alpha = 0.5) + geom_density(data = safety4, aes(x = grade, fill = "Dangerous"), alpha = 0.5) + labs(title = "Safety grade density", x = "Grade", y = "Density", fill = "Safety Level")
highchart() %>% hc_add_series(density(safety1$grade), type = "area", name = "Very safe") %>% hc_add_series(density(safety2$grade), type = "area", name = "Safe") %>% hc_add_series(density(safety3$grade), type = "area", name = "Not safe") %>% hc_add_series(density(safety4$grade), type = "area", name = "Dangerous") %>% hc_title(text = "Safety grade density") %>% hc_yAxis(title = list(text = "Density")) %>% hc_xAxis(title = list(text = "Grade"))
با توجه به اینکه مقدار pvalue بزرگ شده است در نتیجه نمیتوان فرض صفر را رد کرد. این موضوع با توجه به نمودار نیز واضح است زیرا میانگین دو دسته دخترها و پسرها به یکدیگر نزدیک است.
girls_geo = timss %>% filter(content_domain == "Geometry", cognitive_domain == "Applying") %>% select(correct = correct_ratio_per_question_female)
boys_geo = timss %>% filter(content_domain == "Geometry", cognitive_domain == "Applying") %>% select(correct = correct_ratio_per_question_male)
ggplot() + geom_density(data = boys_geo, aes(x = correct, fill = "Boys"), alpha = 0.5) + geom_density(data = girls_geo, aes(x = correct, fill = "Girls"), alpha = 0.5) + labs(title = "Sex geometry grade density", x = "Correct ratio", y = "Density", fill = "Sex")
highchart() %>% hc_add_series(density(boys_geo$correct), type = "area", name = "Boys") %>% hc_add_series(density(girls_geo$correct), type = "area", name = "Girls") %>% hc_title(text = "Sex geometry grade density") %>% hc_yAxis(title = list(text = "Density")) %>% hc_xAxis(title = list(text = "Correct ratio"))
t.test(boys_geo, girls_geo)
##
## Welch Two Sample t-test
##
## data: boys_geo and girls_geo
## t = -0.23614, df = 1493.7, p-value = 0.8134
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02430046 0.01907826
## sample estimates:
## mean of x mean of y
## 0.4086124 0.4112235
با توجه به اینکه مقدار pvalue بسیار کوچک شده در نتیجه فرض صفر رد میشود و نمره بچهها به وضعیت تغذیه آنها مرتبت است که این مساله در نمودار واضح است.
school_grade = av_grade %>% group_by(idcntry,idschool) %>% summarise(grade = mean(grade))
meals = merge(school_grade %>% select(idcntry, idschool, grade), bcg %>% select(breakfast = bcbg06a, lunch = bcbg06b , idcntry, idschool), by=c("idcntry", "idschool")) %>% mutate(meals_count = breakfast + lunch)
meals = meals[!(is.na(meals$meals_count)),]
ggplot() + geom_density(data = meals, aes(x = grade, fill = meals_count < 4), alpha = 0.5)
meals$meals_count = factor(meals$meals_count, levels = 2:6)
aov_test = aov(grade ~ meals_count, meals)
summary(aov_test)
## Df Sum Sq Mean Sq F value Pr(>F)
## meals_count 4 1710650 427662 63.92 <2e-16 ***
## Residuals 7237 48418895 6690
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
با توجه به اینکه مقدار pvalue به دست آمده بسیار کوچک است در نتیجه فرض صفر را رد میکنیم. در واقع تعداد غیبتها بر نمرات بچهها تاثیر میگذارد که این موضوع به وضوح در نمودارهای کشیده شده مشهود است.
absents = av_grade %>% select(absent = bsbg11, grade)
absent1 = absents %>% filter(absent == 1)
absent2 = absents %>% filter(absent == 2)
absent3 = absents %>% filter(absent == 3)
absent4 = absents %>% filter(absent == 4)
absents$absent = factor(absents$absent, levels = 1:4)
aov_test = aov(grade ~ absent, absents)
summary(aov_test)
## Df Sum Sq Mean Sq F value Pr(>F)
## absent 3 2.687e+08 89568437 8687 <2e-16 ***
## Residuals 264981 2.732e+09 10311
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 15145 observations deleted due to missingness
ggplot() + geom_density(data = absent1, aes(x = as.double(grade), fill = "Once a week or more"), alpha = 0.5) + geom_density(data = absent2, aes(x = as.double(grade), fill = "Once every two weeks"), alpha = 0.5) + geom_density(data = absent3, aes(x = as.double(grade), fill = "Once a month"), alpha = 0.5) + geom_density(data = absent4, aes(x = as.double(grade), fill = "Never or almost never"), alpha = 0.5) + labs(title = "Absents grade density", x = "Grade", y = "Density", fill = "Absents Level")
highchart() %>% hc_add_series(density(absent1$grade), type = "area", name = "Once a week or more") %>% hc_add_series(density(absent2$grade), type = "area", name = "Once every two weeks") %>% hc_add_series(density(absent3$grade), type = "area", name = "Once a month") %>% hc_add_series(density(absent4$grade), type = "area", name = "Never or almost never") %>% hc_title(text = "Absents grade density") %>% hc_yAxis(title = list(text = "Density")) %>% hc_xAxis(title = list(text = "Grade"))
با توجه به مقدار pvalue به دست آمده. چون این مقدار نسبتا زیاد است در نتیجه نمیتوان فرض صفر را رد کرد. با توجه به نمودار هم میتوان فهمید که تفاوت چندانی در توزیع نمرات عملی و توصیفی نیست.
iran_app = bsg %>% filter(idcntry == 364) %>% mutate(apply_field = (bssapp01 + bssapp02 + bssapp03 + bssapp04 + bssapp05 + bsmapp01 + bsmapp02 + bsmapp03 + bsmapp04 + bsmapp05)/10, reason_field = (bsmrea01 + bsmrea02 + bsmrea03 + bsmrea04 + bsmrea05 + bssrea01 + bssrea02 + bssrea03 + bssrea04 + bssrea05)/10)
ggplot(iran_app) + geom_density(aes(x = as.double(reason_field), fill = "Reason"), alpha = 0.5) + geom_density(aes(x = as.double(apply_field), fill = "Apply"), alpha = 0.5) + labs(title = "Geometry grade destiny", x = "Grade", y="Destiny", fill = "Geometry")
highchart() %>% hc_add_series(density(iran_app$reason_field), type = "area", name = "Reason Field") %>% hc_add_series(density(iran_app$apply_field), type = "area", name = "Apply Field") %>% hc_title(text = "Geometry grade destiny") %>% hc_yAxis(title = list(text = "Destiny")) %>% hc_xAxis(title = list(text = "Grade"))
t.test(iran_app$reason_field, iran_app$apply_field, alt="greater")
##
## Welch Two Sample t-test
##
## data: iran_app$reason_field and iran_app$apply_field
## t = -0.40608, df = 12235, p-value = 0.6577
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -3.176549 Inf
## sample estimates:
## mean of x mean of y
## 451.7581 452.3870
سه گزاره جالب
در این گزاره تاثیر انگزه بچهها برای ادامه تحصیل بر روی نمرات آنها بررسی شده است که با توجه به اینکه pvalue بسیار کوچک است در نتیجه میتوان نتیجه گرفت که مقدار انگیزه بر نمرات تاثیر میگذارد که این موضوع در نمودار مشهود است.
motivations = av_grade %>% select(motivation_level = bsbg08, grade)
motivations = motivations[!(is.na(motivations$motivation_level)),]
motivations$motivation_level = factor(motivations$motivation_level, levels = 1:6)
aov_test = aov(grade ~ motivation_level, motivations)
summary(aov_test)
## Df Sum Sq Mean Sq F value Pr(>F)
## motivation_level 5 2.944e+08 58882860 5847 <2e-16 ***
## Residuals 271149 2.730e+09 10070
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot() + geom_density(data = motivations, aes(x = as.double(grade), fill = motivation_level), alpha = 0.5)
در این گزاره تاثیر تهدید شدن بچهها بر روی نمراتشان را مور بررسی قرار دادیم که با توجه به اینکه مقدار pvalue بسیار کم بود فرض صفر را رد میکنیم در نتیجه میزان تهدید بر نمرات بچهها تاثیر میگذارد که این موضوع در نمودار مشهود است.
threatened = av_grade %>% select(threatened_level = bsbg16i, grade)
threatened = threatened[!(is.na(threatened$threatened_level)),]
threatened$threatened_level = factor(threatened$threatened_level, levels = 1:4)
aov_test = aov(grade ~ threatened_level, threatened)
summary(aov_test)
## Df Sum Sq Mean Sq F value Pr(>F)
## threatened_level 3 9.830e+07 32765537 3057 <2e-16 ***
## Residuals 268144 2.874e+09 10718
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot() + geom_density(data = threatened, aes(x = as.double(grade), fill = threatened_level), alpha = 0.5)
در این گزاره تاثیر آسیب دیدن بچهها توسط دیگران بر روی نمراتشان را مور بررسی قرار دادیم که با توجه به اینکه مقدار pvalue بسیار کم بود فرض صفر را رد میکنیم در نتیجه میزان آسیب دیدن بر نمرات بچهها تاثیر میگذارد که این موضوع در نمودار مشهود است.
hurt = av_grade %>% select(hurt_level = bsbg16e, grade)
hurt = hurt[!(is.na(hurt$hurt_level)),]
hurt$hurt_level = factor(hurt$hurt_level, levels = 1:4)
aov_test = aov(grade ~ hurt_level, hurt)
summary(aov_test)
## Df Sum Sq Mean Sq F value Pr(>F)
## hurt_level 3 6.619e+07 22062880 2035 <2e-16 ***
## Residuals 267812 2.903e+09 10840
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot() + geom_density(data = hurt, aes(x = as.double(grade), fill = hurt_level), alpha = 0.5)